%%capture
# Notebook setup: third-party imports plus a banner image of the UJI Riu Sec
# campus (the site where the UJIIndoorLoc WiFi fingerprints were collected).
# NOTE(review): this cell is almost entirely duplicated by the next cell —
# consider keeping the imports in one place only.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import pandas_profiling
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')
from IPython.display import Image
from IPython.core.display import HTML
# Display the campus map. NOTE(review): width=9000 looks like a typo
# (presumably 900); also `%%capture` suppresses this cell's output, so the
# image is not actually shown — TODO confirm intent.
Image(url= "https://www.researchgate.net/profile/Joaquin-Torres-Sospedra/publication/283894296/figure/fig7/AS:676977799331845@1538415493912/Map-of-the-UJI-Riu-Sec-Campus-and-zoom-on-the-Tx-Buildings-Pink-refers-to-the-ESTCE-Tx.png",
width=9000, height=200)
%%capture
# Second setup cell: repeats most of the imports above and adds the
# estimators, metrics, and scaler actually used in the modelling sections
# below (KNN, SVM, tree ensembles, RandomForestRegressor, StandardScaler).
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import pandas_profiling
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import r2_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn import linear_model
import warnings
from matplotlib.pyplot import figure
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
# Silence sklearn warnings (e.g. DataConversionWarning for (n, 1) targets).
warnings.filterwarnings('ignore')
Importing our datasets and transforming the WAP values for better interpretation. "0" will represent that the WAP was not detected, and it will scale from "1" (extremely poor signal) to "105" (extremely good signal).
# Load the training and validation sets and remap the 520 WAP signal
# columns for readability: 0 = WAP not detected, then 1 (extremely poor
# signal) up to 105 (extremely good signal).
def _load_and_rescale(path):
    """Read a UJIIndoorLoc CSV and rescale its first 520 (WAP) columns.

    Raw values <= 0 are dBm readings, shifted by +105 into 1..105;
    the "not detected" sentinel (100) is shifted by -100 to become 0.
    """
    data = pd.read_csv(path)
    signals = data.iloc[:, 0:520]
    signals.iloc[:, 0:520] = np.where(data.iloc[:, 0:520] <= 0,
                                      data.iloc[:, 0:520] + 105,
                                      data.iloc[:, 0:520] - 100)
    # Re-attach the non-WAP metadata columns unchanged.
    return pd.concat([signals, data.iloc[:, 520:]], axis=1)

df = _load_and_rescale(r'C:\Users\andre\OneDrive\Andres Marquez\UBIQUM\Project 4\Task 2\trainingData.csv')
validation = _load_and_rescale(r'C:\Users\andre\OneDrive\Andres Marquez\UBIQUM\Project 4\Task 2\ValidationData.csv')
df.head()
| WAP001 | WAP002 | WAP003 | WAP004 | WAP005 | WAP006 | WAP007 | WAP008 | WAP009 | WAP010 | ... | WAP520 | LONGITUDE | LATITUDE | FLOOR | BUILDINGID | SPACEID | RELATIVEPOSITION | USERID | PHONEID | TIMESTAMP | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | -7541.2643 | 4.864921e+06 | 2 | 1 | 106 | 2 | 2 | 23 | 1371713733 |
| 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | -7536.6212 | 4.864934e+06 | 2 | 1 | 106 | 2 | 2 | 23 | 1371713691 |
| 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 0 | 0 | ... | 0 | -7519.1524 | 4.864950e+06 | 2 | 1 | 103 | 2 | 2 | 23 | 1371714095 |
| 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | -7524.5704 | 4.864934e+06 | 2 | 1 | 102 | 2 | 2 | 23 | 1371713807 |
| 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | -7632.1436 | 4.864982e+06 | 0 | 0 | 122 | 2 | 11 | 13 | 1369909710 |
5 rows × 529 columns
Handling duplicates.
# Count duplicate vs. unique rows in the training data.
df.duplicated().value_counts()
False 19300 True 637 dtype: int64
# Same duplicate check on the validation data (it turns out to have none).
validation.duplicated().value_counts()
False 1111 dtype: int64
# Drop the exact duplicate rows from the training data and confirm
# that none remain.
df = df.drop_duplicates()
df.duplicated().value_counts()
False 19300 dtype: int64
Handling missing values.
# Per-column null check for both datasets: False for every column means
# no missing values anywhere.
print(df.isnull().any().value_counts())
print(validation.isnull().any().value_counts())
False 529 dtype: int64 False 529 dtype: int64
Transform Timestamp Unix Format into date format for better understanding.
# Convert the Unix-epoch TIMESTAMP column (seconds) into datetimes.
df['TIMESTAMP'] = pd.to_datetime(df['TIMESTAMP'],unit='s')
validation['TIMESTAMP'] = pd.to_datetime(validation['TIMESTAMP'],unit='s')
Given that the buildings appear rotated (diagonal) in the raw coordinates, we are going to apply a transformation so that we can view them head-on (horizontally).
# Show the raw building footprints, then rotate the coordinate frame by 28
# degrees (true-north correction) so the buildings face the viewer head-on,
# and show the rotated footprints for comparison.
fig = px.scatter(df, x="LONGITUDE", y="LATITUDE", color='BUILDINGID')
fig.show()

trueNorth = 28 * np.pi / 180
cos_a = np.cos(trueNorth)
sin_a = np.sin(trueNorth)
for frame in (df, validation):
    # Standard 2-D rotation; the *_N columns are computed from the original
    # LONGITUDE/LATITUDE before either is overwritten.
    frame['LONGITUDE_N'] = frame['LONGITUDE'] * cos_a - frame['LATITUDE'] * sin_a
    frame['LATITUDE_N'] = frame['LONGITUDE'] * sin_a + frame['LATITUDE'] * cos_a
    # Move the new columns next to the WAP block (position 520), ending with
    # LATITUDE_N at 520 and LONGITUDE_N at 521.
    frame.insert(520, 'LONGITUDE_N', frame.pop('LONGITUDE_N'))
    frame.insert(520, 'LATITUDE_N', frame.pop('LATITUDE_N'))

fig = px.scatter(df, x="LONGITUDE_N", y="LATITUDE_N", color='BUILDINGID')
fig.show()
Now that we know the transformation is correct, we will replace the previous coordinate values with the new ones.
# Replace the original coordinates with the rotated ones, reusing the
# LATITUDE/LONGITUDE names so downstream code is unaffected.
df.drop(['LATITUDE', 'LONGITUDE'], axis=1, inplace=True)
df.rename(columns={"LATITUDE_N": "LATITUDE", "LONGITUDE_N": "LONGITUDE"}, inplace=True)
validation.drop(['LATITUDE', 'LONGITUDE'], axis=1, inplace=True)
validation.rename(columns={"LATITUDE_N": "LATITUDE", "LONGITUDE_N": "LONGITUDE"}, inplace=True)
Let's look more deeply into the WAP values and see what's happening.
# Identify WAPs that are never detected (column maximum == 0) in each
# dataset, then intersect the two lists to see whether any WAP is dead in
# both — only those could safely be dropped from both sets.
def _dead_waps(frame):
    """Return a one-column DataFrame ('WAP') naming all-zero WAP columns."""
    maxima = pd.DataFrame(frame.iloc[:, :520].agg(['max']).max())
    maxima = maxima[maxima[0] == 0].reset_index()
    maxima.drop([0], axis=1, inplace=True)
    maxima.rename(columns={"index": "WAP"}, inplace=True)
    return maxima

waps = _dead_waps(df)
waps_val = _dead_waps(validation)
# Inner merge keeps only WAPs dead in BOTH datasets.
waps_l = waps.merge(waps_val, on='WAP')
len(waps_l)
0
We can see that both datasets contain plenty of columns whose values are all 0, but none of those columns coincide between the two datasets, so we can't remove them.
Let's analyze the Timestamp and see if there's something odd.
# Count scans per exact timestamp; ten or more identical stamps is
# suspicious for manually collected fingerprints.
vc = df['TIMESTAMP'].value_counts()
print(vc[vc >= 10])
2013-06-20 08:01:27 15 2013-06-20 07:57:58 12 2013-06-20 09:21:24 12 2013-06-20 09:27:12 12 2013-06-20 07:57:57 11 2013-06-20 07:55:07 11 2013-06-20 08:01:31 10 2013-06-20 07:49:35 10 2013-06-20 10:07:06 10 2013-06-12 16:04:22 10 2013-06-20 07:42:38 10 2013-06-20 09:24:53 10 2013-06-20 07:55:08 10 Name: TIMESTAMP, dtype: int64
There's clearly something weird, way too many duplicates in the same time stamp, let's analyze it by each customer and see if any of them could be in a different place in the same time.
# Group the 9 non-WAP columns by (timestamp, user) to see whether a single
# user has many records — possibly in different places — at the same instant.
df[df.columns[-9:]].groupby(['TIMESTAMP', 'USERID']).count().sort_values(by='FLOOR',ascending=False).head()
| LATITUDE | LONGITUDE | FLOOR | BUILDINGID | SPACEID | RELATIVEPOSITION | PHONEID | ||
|---|---|---|---|---|---|---|---|---|
| TIMESTAMP | USERID | |||||||
| 2013-06-20 08:01:27 | 13 | 10 | 10 | 10 | 10 | 10 | 10 | 10 |
| 2013-06-12 16:04:22 | 1 | 10 | 10 | 10 | 10 | 10 | 10 | 10 |
| 2013-06-20 10:07:06 | 14 | 8 | 8 | 8 | 8 | 8 | 8 | 8 |
| 2013-06-20 08:09:54 | 14 | 8 | 8 | 8 | 8 | 8 | 8 | 8 |
| 2013-06-20 07:42:38 | 14 | 7 | 7 | 7 | 7 | 7 | 7 | 7 |
Odd Users: 13, 1, 14, 11, 9, 6, 7
# Inspect every row recorded at one of the most repeated timestamps.
df[(df['TIMESTAMP']=='2013-06-12 16:04:22')]
| WAP001 | WAP002 | WAP003 | WAP004 | WAP005 | WAP006 | WAP007 | WAP008 | WAP009 | WAP010 | ... | WAP520 | LATITUDE | LONGITUDE | FLOOR | BUILDINGID | SPACEID | RELATIVEPOSITION | USERID | PHONEID | TIMESTAMP | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 15955 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 4.291947e+06 | -2.290712e+06 | 2 | 0 | 130 | 2 | 1 | 14 | 2013-06-12 16:04:22 |
| 16220 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 4.291941e+06 | -2.290709e+06 | 2 | 0 | 140 | 2 | 1 | 14 | 2013-06-12 16:04:22 |
| 16488 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 4.291941e+06 | -2.290709e+06 | 2 | 0 | 140 | 2 | 1 | 14 | 2013-06-12 16:04:22 |
| 16757 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 4.291941e+06 | -2.290709e+06 | 2 | 0 | 140 | 2 | 1 | 14 | 2013-06-12 16:04:22 |
| 17035 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 4.291941e+06 | -2.290709e+06 | 2 | 0 | 140 | 2 | 1 | 14 | 2013-06-12 16:04:22 |
| 17311 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 4.291941e+06 | -2.290709e+06 | 2 | 0 | 140 | 2 | 1 | 14 | 2013-06-12 16:04:22 |
| 17587 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 4.291941e+06 | -2.290709e+06 | 2 | 0 | 140 | 2 | 1 | 14 | 2013-06-12 16:04:22 |
| 17864 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 4.291941e+06 | -2.290709e+06 | 2 | 0 | 140 | 2 | 1 | 14 | 2013-06-12 16:04:22 |
| 18139 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 4.291947e+06 | -2.290712e+06 | 2 | 0 | 130 | 2 | 1 | 14 | 2013-06-12 16:04:22 |
| 18413 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 4.291947e+06 | -2.290712e+06 | 2 | 0 | 130 | 2 | 1 | 14 | 2013-06-12 16:04:22 |
10 rows × 529 columns
After analysing each of our users, we determined that users 13 and 1 have some odd values that cannot be trusted, so we'll remove those records from our dataset.
# Remove untrustworthy observations: two hand-picked rows (users 13/1, found
# during the per-user analysis) plus every record sharing the suspicious
# 2013-06-12 16:04:22 timestamp.
suspect_rows = pd.concat([
    df.loc[[12168, 12251]],
    df[df['TIMESTAMP'] == '2013-06-12 16:04:22'],
])
df.drop(index=suspect_rows.index, inplace=True)
Now, let's move on in our analysis, we'll be plotting the building in 3D and see if we see any anomalies in it.
# 3-D scatter of the whole campus (longitude / latitude / floor), coloured
# by building, to spot geometric anomalies.
fig = px.scatter_3d(df, x="LONGITUDE", y="LATITUDE", z='FLOOR', color = 'BUILDINGID', opacity=0.7, color_continuous_scale=px.colors.sequential.Viridis)
fig.update_traces(marker=dict(size=4))
fig.update_layout(
width=950,
margin=dict(r=100, l=100, b=10, t=10))
fig.show()
# For every building/floor pair, print which users collected fingerprints
# there. Assumes building and floor ids are contiguous starting at 0, which
# holds for this dataset.
for b_id in range(df['BUILDINGID'].nunique()):
    print("building: ", b_id)
    floor_count = df.loc[df['BUILDINGID'] == b_id, 'FLOOR'].nunique()
    for f_id in range(floor_count):
        users_here = set(df[(df['BUILDINGID'] == b_id) & (df['FLOOR'] == f_id)]['USERID'])
        print("floor: ", f_id, "userID: ", users_here)
    print("\n")
building: 0
floor: 0 userID: {1, 11}
floor: 1 userID: {1, 11}
floor: 2 userID: {1, 11}
floor: 3 userID: {1, 11}
building: 1
floor: 0 userID: {7, 11, 13, 14, 16}
floor: 1 userID: {7, 11, 13, 14, 16}
floor: 2 userID: {9, 2, 10, 4}
floor: 3 userID: {8, 9, 10, 17, 18}
building: 2
floor: 0 userID: {8, 9, 10, 17}
floor: 1 userID: {7, 12, 15, 16, 18}
floor: 2 userID: {11, 5, 14}
floor: 3 userID: {2, 5, 6, 11, 14}
floor: 4 userID: {3, 13, 6}
There appears to be a missing corner in our 3rd building; also, the distribution across the first three floors seems a little odd.
# 3-D view of building 2 restricted to the users seen on its floor 1 and to
# SPACEID < 200, coloured by RELATIVEPOSITION.
# Fix: removed the unused local `numbers = [3,8]` left over from a draft.
fig = px.scatter_3d(df[(df['BUILDINGID']==2) & (df['USERID'].isin([7, 12, 15, 16, 18])) & (df['SPACEID']<200)], x="LONGITUDE", y="LATITUDE",
z='FLOOR', color= 'RELATIVEPOSITION',opacity=0.7,color_continuous_scale=px.colors.sequential.Viridis)
fig.update_traces(marker=dict(size=4))
fig.update_layout(
width=950,
margin=dict(r=100, l=100, b=10, t=10))
fig.show()
# Compare building 2's footprint with and without user 12's records to see
# which part of the building only that user covers.
fig = px.scatter(df[(df['BUILDINGID']==2) & (df['USERID'].isin([7, 15, 12, 16, 18]))], x="LONGITUDE", y="LATITUDE",
color='USERID', color_continuous_scale=px.colors.sequential.Viridis)
fig.update_layout(height=400, width=800, title_text="Including 12")
fig2 = px.scatter(df[(df['BUILDINGID']==2) & (df['USERID'].isin([7, 15, 16, 18]))], x="LONGITUDE", y="LATITUDE",
color='USERID', color_continuous_scale=px.colors.sequential.Viridis)
fig2.update_layout(height=400, width=800, title_text="Excluding 12")
fig.show()
fig2.show()
It seems that our user #12 is the only one covering one side of the building — curiously, the very side that is missing from our 5th floor — so let's plot that side on its own and see how it looks.
# Plot users 12 and 3 (SPACEID >= 200) side by side, then count records per
# floor in building 2, to test whether user 12's side belongs to floor 4.
fig = px.scatter(df[(df['BUILDINGID']==2) & (df['USERID'].isin([12, 3])) & (df['SPACEID']>=200)], x="LONGITUDE", y="LATITUDE",
color='USERID', color_continuous_scale=px.colors.sequential.Viridis)
fig.show()
df[(df['BUILDINGID']==2)]['FLOOR'].value_counts()
3 2708 1 2162 0 1942 2 1577 4 727 Name: FLOOR, dtype: int64
An interesting finding: it seems that one of the sides our user #12 recorded doesn't belong to floor 1 but to floor 4. Since we can't ask the company to validate this, all we can do is make an assumption. Our analysis suggests there was a problem during user #12's recording of one side of the building: those records were assigned to the 2nd floor when they should have been assigned to the 5th. However, because we cannot validate this information, we'll keep the data as it is and continue with our analysis rather than change it based on an assumption alone.
#df1 = df[(df['BUILDINGID']==2) & (df['USERID'].isin([12])) & (df['SPACEID']>=200)]
#df1['FLOOR'] = df1['FLOOR'].replace(1,4)
#df.loc[df1.index.values.tolist()] = df1
#fig = px.scatter_3d(df[(df['BUILDINGID']==2)], x="LONGITUDE", y="LATITUDE", z='FLOOR', color = 'FLOOR', opacity=0.7, color_continuous_scale=px.colors.sequential.Viridis)
#fig.update_traces(marker=dict(size=4))
##fig.update_layout(
#width=950,
#margin=dict(r=100, l=100, b=10, t=10))
#fig.show()
Given that our validation data has no information about SpaceId and RelativePosition, we will not use these variables in our further analysis and predictions. Also, since there is no theory to support that Timestamp and PhoneId can help us predict a person's location, we'll drop those variables too.
# Feature/target setup for the BUILDING classifier:
# X = the 520 WAP signal columns; y = column 523, which after the earlier
# column moves is BUILDINGID. SPACEID, RELATIVEPOSITION, USERID, PHONEID and
# TIMESTAMP are deliberately excluded (see the note above).
X = np.asarray(df.iloc[:,0:520])
y = np.asarray(df.iloc[:,523:524])
X_val = np.asarray(validation.iloc[:,0:520])
y_val = np.asarray(validation.iloc[:,523:524])
# Hold out 20% for testing; fit the scaler on the training split only (no
# leakage) and apply it to test and validation sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = 123, shuffle=True)
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
X_val = scaler.transform(X_val)
print ('Train set:', X_train.shape, y_train.shape)
print ('Test set:', X_test.shape, y_test.shape)
print ('Validation set:', X_val.shape, y_val.shape)
Train set: (15430, 520) (15430, 1) Test set: (3858, 520) (3858, 1) Validation set: (1111, 520) (1111, 1)
# Sweep k = 1..19 for a KNN building classifier, scoring each k by
# validation accuracy.
# NOTE(review): selecting k on the validation set leaks information into
# model choice; sweeping on the held-out test split would be cleaner.
Ks = 20
mean_acc = np.zeros((Ks-1))
std_acc = np.zeros((Ks-1))
for n in range(1,Ks):
    #Train Model and Predict
    # .ravel() gives sklearn the expected 1-D target (y_train is (n, 1)).
    neigh = KNeighborsClassifier(n_neighbors = n).fit(X_train, y_train.ravel())
    pred_val=neigh.predict(X_val)
    mean_acc[n-1] = metrics.accuracy_score(y_val, pred_val)
    # Bug fix: y_val has shape (n, 1) while pred_val is (n,), so the original
    # `pred_val == y_val` broadcast to an (n, n) matrix and std_acc was
    # meaningless. Flatten y_val to make the comparison element-wise.
    std_acc[n-1]=np.std(pred_val==y_val.ravel())/np.sqrt(pred_val.shape[0])
mean_acc
print( "The best accuracy was with", round(mean_acc.max(),4), "with k=", mean_acc.argmax()+1)
The best accuracy was with 0.9937 with k= 2
# Refit KNN with the best k found in the sweep above and report accuracy on
# train, test and validation splits.
k = mean_acc.argmax()+1
#Train Model and Predict
neigh = KNeighborsClassifier(n_neighbors = k).fit(X_train, y_train)
pred = neigh.predict(X_test)
pred_val = neigh.predict(X_val)
print("Train set Accuracy: ", round(metrics.accuracy_score(y_train, neigh.predict(X_train)),4))
print("Test set Accuracy: ", round(metrics.accuracy_score(y_test, pred),4))
print("Validation set Accuracy: ", round(metrics.accuracy_score(y_val, pred_val),4))
Train set Accuracy: 0.9982 Test set Accuracy: 0.9974 Validation set Accuracy: 0.9937
# SVM building classifier on the scaled features.
# NOTE(review): the sigmoid kernel is an unusual choice here; rbf/linear are
# the conventional baselines — confirm it was deliberate.
clf = svm.SVC(kernel='sigmoid')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_train_pred = clf.predict(X_train)
y_val_pred = clf.predict(X_val)
print("Train set Accuracy: ", round(metrics.accuracy_score(y_train, clf.predict(X_train)),4))
print("Test set Accuracy: ", round(metrics.accuracy_score(y_test, y_pred),4))
print("Validation set Accuracy: ", round(metrics.accuracy_score(y_val, y_val_pred),4))
Train set Accuracy: 0.9957 Test set Accuracy: 0.9933 Validation set Accuracy: 0.9928
# Rebuild the raw (unscaled) feature/target arrays — trees do not need
# scaling — and compare three tree-based classifiers on BUILDINGID using
# 3-fold cross-validated accuracy.
X = np.asarray(df.iloc[:, 0:520])
y = np.asarray(df.iloc[:, 523:524])
X_val = np.asarray(validation.iloc[:, 0:520])
y_val = np.asarray(validation.iloc[:, 523:524])

algos_Class = [
    ('Random Forest Classifier', RandomForestClassifier()),
    ('Decision Tree Classifier', DecisionTreeClassifier()),
    ('Gradient Boosting Classifier', GradientBoostingClassifier()),
]
results = []
names = []
for name, model in algos_Class:
    names.append(name)
    results.append(cross_val_score(model, X, y, cv=3, scoring='accuracy'))

for model_name, scores in zip(names, results):
    print(model_name, scores.mean())
Random Forest Classifier 0.9881268303672908 Decision Tree Classifier 0.9878677577246248 Gradient Boosting Classifier 0.9910307275523259
# Fit the three tree-based models on an 80/20 split of the raw features and
# score each on the external validation set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = 123, shuffle=True)
dtc = DecisionTreeClassifier()
gbm = GradientBoostingClassifier()
rfc = RandomForestClassifier()
dtc.fit(X_train, y_train)
gbm.fit(X_train, y_train)
rfc.fit(X_train, y_train)
dtc_pred = dtc.predict(X_val)
gbm_pred = gbm.predict(X_val)
rfc_pred = rfc.predict(X_val)
print("DTC Validation set Accuracy: ", metrics.accuracy_score(y_val, dtc_pred))
print("GBM Validation set Accuracy: ", metrics.accuracy_score(y_val, gbm_pred))
print("RFC Validation set Accuracy: ", metrics.accuracy_score(y_val, rfc_pred))
DTC Validation set Accuracy: 0.9720972097209721 GBM Validation set Accuracy: 0.9981998199819982 RFC Validation set Accuracy: 0.9990999099909991
It seems that 2 of the 3 decision-tree-based models give us nearly perfect results when predicting the building, so we'll use those predictions and turn our attention to predicting the floor.
# Cascade step (building -> floor): append the RFC building predictions to
# the validation set and move BUILDINGID next to the WAPs in df, so the
# floor models use the (predicted) building as a 521st feature.
validation.insert(520, 'BUILDING_PRED', rfc_pred)
df.insert(520, 'BUILDINGID', df.pop('BUILDINGID'))
# After the moves, column 523 is FLOOR in both frames — the new target.
X = np.asarray(df.iloc[:,0:521])
y = np.asarray(df.iloc[:,523:524])
X_val = np.asarray(validation.iloc[:,0:521])
y_val = np.asarray(validation.iloc[:,523:524])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = 123, shuffle=True)
# Re-fit the existing scaler on the new (521-feature) training split.
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
X_val = scaler.transform(X_val)
print ('Train set:', X_train.shape, y_train.shape)
print ('Test set:', X_test.shape, y_test.shape)
print ('Validation set:', X_val.shape, y_val.shape)
Train set: (15430, 521) (15430, 1) Test set: (3858, 521) (3858, 1) Validation set: (1111, 521) (1111, 1)
# Sweep k = 1..19 for the KNN FLOOR classifier, scoring by validation
# accuracy (same leakage caveat as the building sweep).
Ks = 20
mean_acc = np.zeros((Ks-1))
std_acc = np.zeros((Ks-1))
for n in range(1,Ks):
    #Train Model and Predict
    # .ravel() gives sklearn the expected 1-D target (y_train is (n, 1)).
    neigh = KNeighborsClassifier(n_neighbors = n).fit(X_train, y_train.ravel())
    pred_val=neigh.predict(X_val)
    mean_acc[n-1] = metrics.accuracy_score(y_val, pred_val)
    # Bug fix: y_val is (n, 1) and pred_val is (n,); the original comparison
    # broadcast to an (n, n) matrix, so std_acc was meaningless. Flatten
    # y_val to make the comparison element-wise.
    std_acc[n-1]=np.std(pred_val==y_val.ravel())/np.sqrt(pred_val.shape[0])
mean_acc
print( "The best accuracy was with", round(mean_acc.max(),4), "with k=", mean_acc.argmax()+1)
The best accuracy was with 0.8272 with k= 16
# Refit KNN with the best k from the floor sweep and report accuracy on all
# three splits.
k = mean_acc.argmax()+1
#Train Model and Predict
neigh = KNeighborsClassifier(n_neighbors = k).fit(X_train, y_train)
pred = neigh.predict(X_test)
pred_val = neigh.predict(X_val)
print("Train set Accuracy: ", round(metrics.accuracy_score(y_train, neigh.predict(X_train)),4))
print("Test set Accuracy: ", round(metrics.accuracy_score(y_test, pred),4))
print("Validation set Accuracy: ", round(metrics.accuracy_score(y_val, pred_val),4))
Train set Accuracy: 0.9798 Test set Accuracy: 0.9705 Validation set Accuracy: 0.8272
# Re-fit the sigmoid-kernel SVC (the same `clf` instance created in the
# building section) on the floor task and score all three splits.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_train_pred = clf.predict(X_train)
y_val_pred = clf.predict(X_val)
print("Train set Accuracy: ", round(metrics.accuracy_score(y_train, clf.predict(X_train)),4))
print("Test set Accuracy: ", round(metrics.accuracy_score(y_test, y_pred),4))
print("Validation set Accuracy: ", round(metrics.accuracy_score(y_val, y_val_pred),4))
Train set Accuracy: 0.9062 Test set Accuracy: 0.9046 Validation set Accuracy: 0.8434
# Tree-based floor models on the raw (unscaled) 520 WAPs + building column,
# scored on the external validation set.
X = np.asarray(df.iloc[:,0:521])
y = np.asarray(df.iloc[:,523:524])
X_val = np.asarray(validation.iloc[:,0:521])
y_val = np.asarray(validation.iloc[:,523:524])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = 123, shuffle=True)
dtc = DecisionTreeClassifier()
gbm = GradientBoostingClassifier()
rfc = RandomForestClassifier()
dtc.fit(X_train, y_train)
gbm.fit(X_train, y_train)
rfc.fit(X_train, y_train)
dtc_pred = dtc.predict(X_val)
gbm_pred = gbm.predict(X_val)
rfc_pred = rfc.predict(X_val)
print("DTC Validation set Accuracy: ", round(metrics.accuracy_score(y_val, dtc_pred),4))
print("GBM Validation set Accuracy: ", round(metrics.accuracy_score(y_val, gbm_pred),4))
print("RFC Validation set Accuracy: ", round(metrics.accuracy_score(y_val, rfc_pred),4))
DTC Validation set Accuracy: 0.7975 GBM Validation set Accuracy: 0.8767 RFC Validation set Accuracy: 0.9091
The Random Forest Classifier seems to be our best model for predicting the floor a person is on; next, we'll try to predict the longitude and latitude.
# Cascade step (floor -> position): append the RFC floor predictions to the
# validation set, move FLOOR next to the features in df, and train a
# longitude regressor on WAPs + building + floor (columns 0..521).
validation.insert(521, 'FLOOR_PRED', rfc_pred)
df.insert(521, 'FLOOR', df.pop('FLOOR'))
X = np.asarray(df.iloc[:,0:522])
y = np.asarray(df['LONGITUDE'])
X_val = np.asarray(validation.iloc[:,0:522])
y_val = np.asarray(validation['LONGITUDE'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = 123, shuffle=True)
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
# Train the model on training data
rf.fit(X_train, y_train)
predictions = rf.predict(X_test)
# Calculate the absolute errors
errors = abs(predictions - y_test)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'meters.')
Mean Absolute Error: 2.13 meters.
# Evaluate the longitude model on the validation set; note `predictions`
# now holds validation-set predictions (consumed by the insert below).
predictions = rf.predict(X_val)
# Calculate the absolute errors
errors = abs(predictions - y_val)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'meters.')
Mean Absolute Error: 7.71 meters.
# Cascade continues: append the longitude predictions to the validation set,
# move LONGITUDE into the feature block of df, and train the latitude
# regressor on columns 0..522 (WAPs + building + floor + longitude).
validation.insert(522, 'LONGITUDE_PRED', predictions)
df.insert(522, 'LONGITUDE', df.pop('LONGITUDE'))
X = np.asarray(df.iloc[:,0:523])
y = np.asarray(df['LATITUDE'])
X_val = np.asarray(validation.iloc[:,0:523])
y_val = np.asarray(validation['LATITUDE'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = 123, shuffle=True)
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
# Train the model on training data
rf.fit(X_train, y_train)
predictions = rf.predict(X_test)
# Calculate the absolute errors
errors = abs(predictions - y_test)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'meters.')
Mean Absolute Error: 1.15 meters.
# Evaluate the latitude model on the validation set; `predictions` again
# holds validation-set values for the insert in the next cell.
predictions = rf.predict(X_val)
# Calculate the absolute errors
errors = abs(predictions - y_val)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'meters.')
Mean Absolute Error: 7.73 meters.
# Store the latitude predictions and report the mean Euclidean positioning
# error between predicted and true coordinates (labelled MAE in this
# notebook, but it is really a mean straight-line distance).
validation.insert(523, 'LATITUDE_PRED', predictions)
df.insert(523, 'LATITUDE', df.pop('LATITUDE'))
d_lon = validation['LONGITUDE'] - validation['LONGITUDE_PRED']
d_lat = validation['LATITUDE'] - validation['LATITUDE_PRED']
# The abs() in the original was redundant — sqrt is already non-negative.
MAE = np.mean(np.sqrt(d_lon ** 2 + d_lat ** 2))
print('Accurate between a range of', round(MAE,2), 'meters.')
Accurate between a range of 12.14 meters.
It looks like our cascade method is giving us decent results: our prediction has an accuracy of around 100% when determining the building the person is in, 91% when determining the floor the person is on, and it is also accurate to within a range of around 12 meters when determining their position.